{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Vector Representations of Words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Copyright 2015 The TensorFlow Authors. All Rights Reserved.\n",
    "#\n",
    "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
    "# you may not use this file except in compliance with the License.\n",
    "# You may obtain a copy of the License at\n",
    "#\n",
    "#     http://www.apache.org/licenses/LICENSE-2.0\n",
    "#\n",
    "# Unless required by applicable law or agreed to in writing, software\n",
    "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
    "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
    "# See the License for the specific language governing permissions and\n",
    "# limitations under the License.\n",
    "# ==============================================================================\n",
    "\n",
    "from __future__ import absolute_import\n",
    "from __future__ import division\n",
    "from __future__ import print_function\n",
    "\n",
    "import collections\n",
    "import math\n",
    "import os\n",
    "import random\n",
    "import zipfile\n",
    "\n",
    "import numpy as np\n",
    "from six.moves import urllib\n",
    "from six.moves import xrange  # pylint: disable=redefined-builtin\n",
    "import tensorflow as tf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Found and verified text8.zip\n",
      "Data size 17005207\n"
     ]
    }
   ],
   "source": [
    "# Step 1: Download the data.\n",
    "url = 'http://mattmahoney.net/dc/'\n",
    "\n",
    "\n",
    "def maybe_download(filename, expected_bytes):\n",
    "  \"\"\"Download a file if not present, and make sure it's the right size.\"\"\"\n",
    "  if not os.path.exists(filename):\n",
    "    filename, _ = urllib.request.urlretrieve(url + filename, filename)\n",
    "  statinfo = os.stat(filename)\n",
    "  if statinfo.st_size == expected_bytes:\n",
    "    print('Found and verified', filename)\n",
    "  else:\n",
    "    print(statinfo.st_size)\n",
    "    raise Exception(\n",
    "        'Failed to verify ' + filename + '. Can you get to it with a browser?')\n",
    "  return filename\n",
    "\n",
    "filename = maybe_download('text8.zip', 31344016)\n",
    "\n",
    "\n",
    "# Read the data into a list of strings.\n",
    "def read_data(filename):\n",
    "  \"\"\"Extract the first file enclosed in a zip file as a list of words\"\"\"\n",
    "  with zipfile.ZipFile(filename) as f:\n",
    "    data = tf.compat.as_str(f.read(f.namelist()[0])).split()\n",
    "  return data\n",
    "\n",
    "words = read_data(filename)\n",
    "print('Data size', len(words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "words =  ['originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against', 'early', 'working', 'class', 'radicals', 'including', 'the', 'diggers', 'of', 'the', 'english', 'revolution', 'and', 'the', 'sans', 'culottes', 'of', 'the', 'french', 'revolution', 'whilst', 'the', 'term', 'is', 'still', 'used', 'in', 'a', 'pejorative', 'way', 'to', 'describe', 'any', 'act', 'that', 'used', 'violent', 'means', 'to', 'destroy', 'the', 'organization', 'of', 'society', 'it', 'has', 'also', 'been', 'taken', 'up', 'as', 'a', 'positive', 'label', 'by', 'self', 'defined', 'anarchists', 'the', 'word', 'anarchism', 'is', 'derived', 'from', 'the', 'greek', 'without', 'archons', 'ruler', 'chief', 'king', 'anarchism', 'as', 'a', 'political', 'philosophy', 'is', 'the', 'belief', 'that', 'rulers', 'are', 'unnecessary', 'and', 'should', 'be', 'abolished', 'although', 'there', 'are', 'differing', 'interpretations', 'of', 'what', 'this', 'means', 'anarchism', 'also', 'refers', 'to', 'related', 'social', 'movements', 'that', 'advocate', 'the', 'elimination', 'of', 'authoritarian', 'institutions', 'particularly', 'the', 'state', 'the', 'word', 'anarchy', 'as', 'most', 'anarchists', 'use', 'it', 'does', 'not', 'imply', 'chaos', 'nihilism', 'or', 'anomie', 'but', 'rather', 'a', 'harmonious', 'anti', 'authoritarian', 'society', 'in', 'place', 'of', 'what', 'are', 'regarded', 'as', 'authoritarian', 'political', 'structures', 'and', 'coercive', 'economic', 'institutions', 'anarchists', 'advocate', 'social', 'relations', 'based', 'upon', 'voluntary', 'association', 'of', 'autonomous', 'individuals', 'mutual', 'aid', 'and', 'self', 'governance', 'while', 'anarchism', 'is', 'most', 'easily', 'defined', 'by', 'what', 'it', 'is', 'against', 'anarchists', 'also', 'offer', 'positive', 'visions', 'of', 'what', 'they', 'believe', 'to', 'be', 'a', 'truly', 'free', 'society', 'however', 'ideas', 'about', 'how', 'an', 'anarchist', 'society', 'might', 'work', 'vary', 'considerably', 'especially', 'with', 'respect', 'to', 'economics', 'there', 'is', 'also', 'disagreement', 'about', 'how', 'a', 'free', 'society', 'might', 'be', 'brought', 'about', 'origins', 'and', 'predecessors', 'kropotkin', 'and', 'others', 'argue', 'that', 'before', 'recorded', 'history', 'human', 'society', 'was', 'organized', 'on', 'anarchist', 'principles', 'most', 'anthropologists', 'follow', 'kropotkin', 'and', 'engels', 'in', 'believing', 'that', 'hunter', 'gatherer', 'bands', 'were', 'egalitarian', 'and', 'lacked', 'division', 'of', 'labour', 'accumulated', 'wealth', 'or', 'decreed', 'law', 'and', 'had', 'equal', 'access', 'to', 'resources', 'william', 'godwin', 'anarchists', 'including', 'the', 'the', 'anarchy', 'organisation', 'and', 'rothbard', 'find', 'anarchist', 'attitudes', 'in', 'taoism', 'from', 'ancient', 'china', 'kropotkin', 'found', 'similar', 'ideas', 'in', 'stoic', 'zeno', 'of', 'citium', 'according', 'to', 'kropotkin', 'zeno', 'repudiated', 'the', 'omnipotence', 'of', 'the', 'state', 'its', 'intervention', 'and', 'regimentation', 'and', 'proclaimed', 'the', 'sovereignty', 'of', 'the', 'moral', 'law', 'of', 'the', 'individual', 'the', 'anabaptists', 'of', 'one', 'six', 'th', 'century', 'europe', 'are', 'sometimes', 'considered', 'to', 'be', 'religious', 'forerunners', 'of', 'modern', 'anarchism', 'bertrand', 'russell', 'in', 'his', 'history', 'of', 'western', 'philosophy', 'writes', 'that', 'the', 'anabaptists', 'repudiated', 'all', 'law', 'since', 'they', 'held', 'that', 'the', 'good', 'man', 'will', 'be', 'guided', 'at', 'every', 'moment', 'by', 'the', 'holy', 'spirit', 'from', 'this', 'premise', 'they', 'arrive', 'at', 'communism', 'the', 'diggers', 'or', 'true', 'levellers', 'were', 'an', 'early', 'communistic', 'movement', 'during', 'the', 'time', 'of', 'the', 'english', 'civil', 'war', 'and', 'are', 'considered', 'by', 'some', 'as', 'forerunners', 'of', 'modern', 'anarchism', 'in', 'the', 'modern', 'era', 'the', 'first', 'to', 'use', 'the', 'term', 'to', 'mean', 'something', 'other', 'than', 'chaos', 'was', 'louis', 'armand', 'baron', 'de', 'lahontan', 'in', 'his', 'nouveaux', 'voyages', 'dans', 'l', 'am', 'rique', 'septentrionale', 'one', 'seven', 'zero', 'three', 'where', 'he', 'described', 'the', 'indigenous', 'american', 'society', 'which', 'had', 'no', 'state', 'laws', 'prisons', 'priests', 'or', 'private', 'property', 'as', 'being', 'in', 'anarchy', 'russell', 'means', 'a', 'libertarian', 'and', 'leader', 'in', 'the', 'american', 'indian', 'movement', 'has', 'repeatedly', 'stated', 'that', 'he', 'is', 'an', 'anarchist', 'and', 'so', 'are', 'all', 'his', 'ancestors', 'in', 'one', 'seven', 'nine', 'three', 'in', 'the', 'thick', 'of', 'the', 'french', 'revolution', 'william', 'godwin', 'published', 'an', 'enquiry', 'concerning', 'political', 'justice', 'although', 'godwin', 'did', 'not', 'use', 'the', 'word', 'anarchism', 'many', 'later', 'anarchists', 'have', 'regarded', 'this', 'book', 'as', 'the', 'first', 'major', 'anarchist', 'text', 'and', 'godwin', 'as', 'the', 'founder', 'of', 'philosophical', 'anarchism', 'but', 'at', 'this', 'point', 'no', 'anarchist', 'movement', 'yet', 'existed', 'and', 'the', 'term', 'anarchiste', 'was', 'known', 'mainly', 'as', 'an', 'insult', 'hurled', 'by', 'the', 'bourgeois', 'girondins', 'at', 'more', 'radical', 'elements', 'in', 'the', 'french', 'revolution', 'the', 'first', 'self', 'labelled', 'anarchist', 'pierre', 'joseph', 'proudhon', 'it', 'is', 'commonly', 'held', 'that', 'it', 'wasn', 't', 'until', 'pierre', 'joseph', 'proudhon', 'published', 'what', 'is', 'property', 'in', 'one', 'eight', 'four', 'zero', 'that', 'the', 'term', 'anarchist', 'was', 'adopted', 'as', 'a', 'self', 'description', 'it', 'is', 'for', 'this', 'reason', 'that', 'some', 'claim', 'proudhon', 'as', 'the', 'founder', 'of', 'modern', 'anarchist', 'theory', 'in', 'what', 'is', 'property', 'proudhon', 'answers', 'with', 'the', 'famous', 'accusation', 'property', 'is', 'theft', 'in', 'this', 'work', 'he', 'opposed', 'the', 'institution', 'of', 'decreed', 'property', 'propri', 't', 'where', 'owners', 'have', 'complete', 'rights', 'to', 'use', 'and', 'abuse', 'their', 'property', 'as', 'they', 'wish', 'such', 'as', 'exploiting', 'workers', 'for', 'profit', 'in', 'its', 'place', 'proudhon', 'supported', 'what', 'he', 'called', 'possession', 'individuals', 'can', 'have', 'limited', 'rights', 'to', 'use', 'resources', 'capital', 'and', 'goods', 'in', 'accordance', 'with', 'principles', 'of', 'equality', 'and', 'justice', 'proudhon', 's', 'vision', 'of', 'anarchy', 'which', 'he', 'called', 'mutualism', 'mutuellisme', 'involved', 'an', 'exchange', 'economy', 'where', 'individuals', 'and', 'groups', 'could', 'trade', 'the', 'products', 'of', 'their', 'labor', 'using', 'labor', 'notes', 'which', 'represented', 'the', 'amount', 'of', 'working', 'time', 'involved', 'in', 'production', 'this', 'would', 'ensure', 'that', 'no', 'one', 'would', 'profit', 'from', 'the', 'labor', 'of', 'others', 'workers', 'could', 'freely', 'join', 'together', 'in', 'co', 'operative', 'workshops', 'an', 'interest', 'free', 'bank', 'would', 'be', 'set', 'up', 'to', 'provide', 'everyone', 'with', 'access', 'to', 'the', 'means', 'of', 'production', 'proudhon', 's', 'ideas', 'were', 'influential', 'within', 'french', 'working', 'class', 'movements', 'and', 'his', 'followers', 'were', 'active', 'in', 'the', 'revolution', 'of', 'one', 'eight', 'four', 'eight', 'in', 'france', 'proudhon', 's', 'philosophy', 'of', 'property', 'is', 'complex', 'it', 'was', 'developed', 'in', 'a', 'number', 'of', 'works', 'over', 'his', 'lifetime', 'and', 'there', 'are', 'differing', 'interpretations', 'of', 'some', 'of', 'his', 'ideas', 'for', 'more', 'detailed', 'discussion', 'see', 'here', 'max', 'stirner', 's', 'egoism', 'in', 'his', 'the', 'ego', 'and', 'its', 'own', 'stirner', 'argued', 'that', 'most', 'commonly', 'accepted', 'social', 'institutions', 'including', 'the', 'notion', 'of', 'state', 'property', 'as', 'a', 'right', 'natural', 'rights', 'in', 'general', 'and', 'the', 'very', 'notion', 'of', 'society', 'were', 'mere', 'illusions', 'or', 'ghosts', 'in', 'the', 'mind', 'saying', 'of', 'society', 'that', 'the', 'individuals', 'are', 'its', 'reality', 'he', 'advocated', 'egoism', 'and', 'a', 'form', 'of', 'amoralism', 'in', 'which', 'individuals', 'would', 'unite', 'in', 'associations', 'of', 'egoists', 'only', 'when', 'it', 'was', 'in', 'their', 'self', 'interest', 'to', 'do', 'so', 'for', 'him', 'property', 'simply', 'comes', 'about', 'through', 'might', 'whoever', 'knows', 'how', 'to', 'take', 'to', 'defend', 'the', 'thing', 'to', 'him', 'belongs', 'property', 'and', 'what', 'i', 'have', 'in', 'my', 'power', 'that', 'is', 'my', 'own', 'so', 'long', 'as', 'i', 'assert', 'myself', 'as', 'holder', 'i', 'am', 'the', 'proprietor', 'of', 'the', 'thing', 'stirner', 'never', 'called', 'himself', 'an', 'anarchist', 'he', 'accepted', 'only', 'the', 'label', 'egoist', 'nevertheless', 'his', 'ideas', 'were', 'influential', 'on', 'many', 'individualistically', 'inclined', 'anarchists', 'although', 'interpretations', 'of', 'his', 'thought', 'are', 'diverse', 'american', 'individualist', 'anarchism', 'benjamin', 'tucker', 'in', 'one', 'eight', 'two', 'five', 'josiah', 'warren', 'had', 'participated', 'in', 'a', 'communitarian', 'experiment', 'headed', 'by', 'robert', 'owen', 'called', 'new', 'harmony', 'which', 'failed', 'in', 'a', 'few', 'years', 'amidst', 'much', 'internal', 'conflict', 'warren', 'blamed', 'the', 'community', 's', 'failure', 'on', 'a', 'lack', 'of', 'individual', 'sovereignty', 'and', 'a', 'lack', 'of', 'private', 'property', 'warren', 'proceeded', 'to', 'organise', 'experimenal', 'anarchist', 'communities', 'which', 'respected', 'what', 'he', 'called', 'the', 'sovereignty', 'of', 'the', 'individual', 'at', 'utopia', 'and', 'modern', 'times', 'in', 'one', 'eight', 'three', 'three', 'warren', 'wrote', 'and', 'published', 'the', 'peaceful', 'revolutionist', 'which', 'some', 'have', 'noted', 'to', 'be', 'the', 'first', 'anarchist', 'periodical', 'ever', 'published', 'benjamin', 'tucker', 'says', 'that', 'warren', 'was', 'the', 'first', 'man', 'to', 'expound', 'and', 'formulate', 'the', 'doctrine', 'now', 'known', 'as', 'anarchism', 'liberty', 'xiv', 'december', 'one', 'nine', 'zero', 'zero', 'one', 'benjamin', 'tucker', 'became', 'interested', 'in', 'anarchism', 'through', 'meeting', 'josiah', 'warren', 'and', 'william', 'b', 'greene', 'he', 'edited', 'and', 'published', 'liberty', 'from', 'august', 'one', 'eight', 'eight', 'one', 'to', 'april', 'one', 'nine', 'zero', 'eight', 'it', 'is', 'widely', 'considered', 'to', 'be', 'the', 'finest', 'individualist', 'anarchist', 'periodical', 'ever', 'issued', 'in', 'the', 'english', 'language', 'tucker', 's', 'conception', 'of', 'individualist', 'anarchism', 'incorporated', 'the', 'ideas', 'of', 'a', 'variety', 'of', 'theorists', 'greene', 's', 'ideas', 'on', 'mutual', 'banking', 'warren', 's', 'ideas', 'on', 'cost', 'as', 'the', 'limit', 'of', 'price', 'a', 'heterodox', 'variety', 'of', 'labour', 'theory', 'of', 'value', 'proudhon', 's', 'market', 'anarchism', 'max', 'stirner', 's', 'egoism', 'and', 'herbert', 'spencer', 's', 'law', 'of', 'equal', 'freedom', 'tucker', 'strongly', 'supported', 'the', 'individual', 's', 'right', 'to', 'own', 'the', 'product', 'of', 'his', 'or', 'her', 'labour', 'as', 'private', 'property', 'and', 'believed', 'in', 'a', 'market', 'economy', 'for', 'trading', 'this', 'property', 'he', 'argued', 'that', 'in', 'a', 'truly', 'free', 'market', 'system', 'without', 'the', 'state', 'the', 'abundance', 'of', 'competition', 'would', 'eliminate', 'profits', 'and', 'ensure', 'that', 'all', 'workers', 'received', 'the', 'full', 'value', 'of', 'their', 'labor', 'other', 'one', 'nine', 'th', 'century', 'individualists', 'included', 'lysander', 'spooner', 'stephen', 'pearl', 'andrews', 'and', 'victor', 'yarros', 'the', 'first', 'international', 'mikhail', 'bakunin', 'one', 'eight', 'one', 'four', 'one', 'eight', 'seven', 'six', 'in', 'europe', 'harsh', 'reaction', 'followed', 'the', 'revolutions', 'of', 'one', 'eight', 'four', 'eight', 'twenty', 'years', 'later', 'in', 'one', 'eight', 'six', 'four', 'the', 'international', 'workingmen', 's', 'association', 'sometimes', 'called', 'the', 'first', 'international', 'united', 'some', 'diverse', 'european', 'revolutionary', 'currents', 'including', 'anarchism', 'due', 'to', 'its', 'genuine', 'links', 'to', 'active', 'workers', 'movements', 'the', 'international', 'became', 'signficiant', 'from', 'the', 'start', 'karl', 'marx', 'was', 'a', 'leading', 'figure', 'in', 'the', 'international', 'he', 'was', 'elected', 'to', 'every', 'succeeding', 'general', 'council', 'of', 'the', 'association', 'the', 'first', 'objections', 'to', 'marx', 'came', 'from', 'the', 'mutualists', 'who', 'opposed', 'communism', 'and', 'statism', 'shortly', 'after', 'mikhail', 'bakunin', 'and', 'his', 'followers', 'joined', 'in', 'one', 'eight', 'six', 'eight', 'the', 'first', 'international', 'became', 'polarised', 'into', 'two', 'camps', 'with', 'marx', 'and', 'bakunin', 'as', 'their', 'respective', 'figureheads', 'the', 'clearest', 'difference', 'between', 'the', 'camps', 'was', 'over', 'strategy', 'the', 'anarchists', 'around', 'bakunin', 'favoured', 'in', 'kropotkin', 's', 'words', 'direct', 'economical', 'struggle', 'against', 'capitalism', 'without', 'interfering', 'in', 'the', 'political', 'parliamentary', 'agitation', 'at', 'that', 'time', 'marx', 'and', 'his', 'followers', 'focused', 'on', 'parliamentary', 'activity', 'bakunin', 'characterised', 'marx', 's', 'ideas', 'as', 'authoritarian', 'and', 'predicted', 'that', 'if', 'a', 'marxist', 'party', 'gained', 'to', 'power', 'its', 'leaders', 'would', 'end', 'up', 'as', 'bad', 'as', 'the', 'ruling', 'class', 'they', 'had', 'fought', 'against', 'in', 'one', 'eight', 'seven', 'two', 'the', 'conflict', 'climaxed', 'with', 'a', 'final', 'split', 'between', 'the', 'two', 'groups', 'at', 'the', 'hague', 'congress', 'this', 'is', 'often', 'cited', 'as', 'the', 'origin', 'of', 'the', 'conflict', 'between', 'anarchists', 'and', 'marxists', 'from', 'this', 'moment', 'the', 'social', 'democratic', 'and', 'libertarian', 'currents', 'of', 'socialism', 'had', 'distinct', 'organisations', 'including', 'rival', 'internationals', 'anarchist', 'communism', 'peter', 'kropotkin', 'proudhon', 'and', 'bakunin', 'both', 'opposed', 'communism', 'associating', 'it', 'with', 'statism', 'however', 'in', 'the', 'one', 'eight', 'seven', 'zero', 's', 'many', 'anarchists', 'moved', 'away', 'from', 'bakunin', 's', 'economic', 'thinking', 'called', 'collectivism', 'and', 'embraced', 'communist', 'concepts', 'communists', 'believed', 'the', 'means', 'of', 'production', 'should', 'be', 'owned', 'collectively', 'and', 'that', 'goods', 'be', 'distributed', 'by', 'need', 'not', 'labor', 'an', 'early', 'anarchist', 'communist', 'was', 'joseph', 'd', 'jacque', 'the', 'first', 'person', 'to', 'describe', 'himself', 'as', 'libertarian', 'unlike', 'proudhon', 'he', 'argued', 'that', 'it', 'is', 'not', 'the', 'product', 'of', 'his', 'or', 'her', 'labor', 'that', 'the', 'worker', 'has', 'a', 'right', 'to', 'but', 'to', 'the', 'satisfaction', 'of', 'his', 'or', 'her', 'needs', 'whatever', 'may', 'be', 'their', 'nature', 'he', 'announced', 'his', 'ideas', 'in', 'his', 'us', 'published', 'journal', 'le', 'libertaire', 'one', 'eight', 'five', 'eight', 'one', 'eight', 'six', 'one', 'peter', 'kropotkin', 'often', 'seen', 'as', 'the', 'most', 'important', 'theorist', 'outlined', 'his', 'economic', 'ideas', 'in', 'the', 'conquest', 'of', 'bread', 'and', 'fields', 'factories', 'and', 'workshops', 'he', 'felt', 'co', 'operation', 'is', 'more', 'beneficial', 'than', 'competition', 'illustrated', 'in', 'nature', 'in', 'mutual', 'aid', 'a', 'factor', 'of', 'evolution', 'one', 'eight', 'nine', 'seven', 'subsequent', 'anarchist', 'communists', 'include', 'emma', 'goldman', 'and', 'alexander', 'berkman', 'many', 'in', 'the', 'anarcho', 'syndicalist', 'movements', 'see', 'below', 'saw', 'anarchist', 'communism', 'as', 'their', 'objective', 'isaac', 'puente', 's', 'one', 'nine', 'three', 'two', 'comunismo', 'libertario', 'was', 'adopted', 'by', 'the', 'spanish', 'cnt', 'as', 'its', 'manifesto', 'for', 'a', 'post', 'revolutionary', 'society', 'some', 'anarchists', 'disliked', 'merging', 'communism', 'with', 'anarchism', 'several', 'individualist', 'anarchists', 'maintained', 'that', 'abolition', 'of', 'private', 'property', 'was', 'not', 'consistent', 'with', 'liberty', 'for', 'example', 'benjamin', 'tucker', 'whilst', 'professing', 'respect', 'for', 'kropotkin', 'and', 'publishing', 'his', 'work', 'described', 'communist', 'anarchism', 'as', 'pseudo', 'anarchism', 'propaganda', 'of', 'the', 'deed', 'johann', 'most', 'was', 'an', 'outspoken', 'advocate', 'of', 'violence', 'anarchists', 'have', 'often', 'been', 'portrayed', 'as', 'dangerous', 'and', 'violent', 'due', 'mainly', 'to', 'a', 'number', 'of', 'high', 'profile', 'violent', 'acts', 'including', 'riots', 'assassinations', 'insurrections', 'and', 'terrorism', 'by', 'some', 'anarchists', 'some', 'revolutionaries', 'of', 'the', 'late', 'one', 'nine', 'th', 'century', 'encouraged', 'acts', 'of', 'political', 'violence', 'such', 'as', 'bombings', 'and', 'the', 'assassinations', 'of', 'heads', 'of', 'state', 'to', 'further', 'anarchism', 'such', 'actions', 'have', 'sometimes', 'been', 'called', 'propaganda', 'by', 'the', 'deed', 'one', 'of', 'the', 'more', 'outspoken', 'advocates', 'of', 'this', 'strategy', 'was', 'johann', 'most', 'who', 'said', 'the', 'existing', 'system', 'will', 'be', 'quickest', 'and', 'most', 'radically', 'overthrown', 'by', 'the', 'annihilation', 'of', 'its', 'exponents', 'therefore', 'massacres', 'of', 'the', 'enemies', 'of', 'the', 'people', 'must', 'be', 'set', 'in', 'motion', 'most', 's', 'preferred', 'method', 'of', 'terrorism', 'dynamite', 'earned', 'him', 'the', 'moniker', 'dynamost', 'however', 'there', 'is', 'no', 'consensus', 'on', 'the', 'legitimacy', 'or', 'utility', 'of', 'violence', 'in', 'general', 'mikhail', 'bakunin', 'and', 'errico', 'malatesta', 'for', 'example', 'wrote', 'of', 'violence', 'as', 'a', 'necessary', 'and', 'sometimes', 'desirable', 'force', 'in', 'revolutionary', 'settings', 'but', 'at', 'the', 'same', 'time', 'they', 'denounced', 'acts', 'of', 'individual', 'terrorism', 'malatesta', 'in', 'on', 'violence', 'and', 'bakunin', 'when']\n"
     ]
    }
   ],
   "source": [
    "print('words = ', words[1:2000]);"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most common words (+UNK) [['UNK', 418391], ('the', 1061396), ('of', 593677), ('and', 416629), ('one', 411764), ('in', 372201), ('a', 325873), ('to', 316376), ('zero', 264975), ('nine', 250430)]\n",
      "Sample data [5236, 3082, 12, 6, 195, 2, 3134, 46, 59, 156] ['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']\n"
     ]
    }
   ],
   "source": [
    "# Step 2: Build the dictionary and replace rare words with UNK token.\n",
    "vocabulary_size = 50000\n",
    "\n",
    "def build_dataset(words, vocabulary_size):\n",
    "  count = [['UNK', -1]]\n",
    "  count.extend(collections.Counter(words).most_common(vocabulary_size - 1))\n",
    "  dictionary = dict()\n",
    "  for word, _ in count:\n",
    "    dictionary[word] = len(dictionary)\n",
    "  data = list()\n",
    "  unk_count = 0\n",
    "  for word in words:\n",
    "    if word in dictionary:\n",
    "      index = dictionary[word]\n",
    "    else:\n",
    "      index = 0  # dictionary['UNK']\n",
    "      unk_count += 1\n",
    "    data.append(index)\n",
    "  count[0][1] = unk_count\n",
    "  reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))\n",
    "  return data, count, dictionary, reverse_dictionary\n",
    "\n",
    "data, count, dictionary, reverse_dictionary = build_dataset(words, vocabulary_size)\n",
    "del words  # Hint to reduce memory.\n",
    "print('Most common words (+UNK)', count[:10])\n",
    "print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]])\n",
    "\n",
    "data_index = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3082 originated -> 5236 anarchism\n",
      "3082 originated -> 12 as\n",
      "12 as -> 3082 originated\n",
      "12 as -> 6 a\n",
      "6 a -> 195 term\n",
      "6 a -> 12 as\n",
      "195 term -> 6 a\n",
      "195 term -> 2 of\n"
     ]
    }
   ],
   "source": [
    "# Step 3: Function to generate a training batch for the skip-gram model.\n",
    "def generate_batch(batch_size, num_skips, skip_window):\n",
    "  global data_index\n",
    "  assert batch_size % num_skips == 0\n",
    "  assert num_skips <= 2 * skip_window\n",
    "  batch = np.ndarray(shape=(batch_size), dtype=np.int32)\n",
    "  labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)\n",
    "  span = 2 * skip_window + 1  # [ skip_window target skip_window ]\n",
    "  buffer = collections.deque(maxlen=span)\n",
    "  for _ in range(span):\n",
    "    buffer.append(data[data_index])\n",
    "    data_index = (data_index + 1) % len(data)\n",
    "  for i in range(batch_size // num_skips):\n",
    "    target = skip_window  # target label at the center of the buffer\n",
    "    targets_to_avoid = [skip_window]\n",
    "    for j in range(num_skips):\n",
    "      while target in targets_to_avoid:\n",
    "        target = random.randint(0, span - 1)\n",
    "      targets_to_avoid.append(target)\n",
    "      batch[i * num_skips + j] = buffer[skip_window]\n",
    "      labels[i * num_skips + j, 0] = buffer[target]\n",
    "    buffer.append(data[data_index])\n",
    "    data_index = (data_index + 1) % len(data)\n",
    "  # Backtrack a little bit to avoid skipping words in the end of a batch\n",
    "  data_index = (data_index + len(data) - span) % len(data)\n",
    "  return batch, labels\n",
    "\n",
    "batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1)\n",
    "for i in range(8):\n",
    "  print(batch[i], reverse_dictionary[batch[i]],\n",
    "        '->', labels[i, 0], reverse_dictionary[labels[i, 0]])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Note to Self: Try to run previous command again"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "embeddings <tf.Variable 'Variable:0' shape=(50000, 128) dtype=float32_ref>\n",
      "embed Tensor(\"embedding_lookup:0\", shape=(128, 128), dtype=float32, device=/device:CPU:0)\n",
      "nce_weights <tf.Variable 'Variable_1:0' shape=(50000, 128) dtype=float32_ref>\n",
      "nce_biases <tf.Variable 'Variable_2:0' shape=(50000,) dtype=float32_ref>\n"
     ]
    }
   ],
   "source": [
    "# Step 4: Build and train a skip-gram model.\n",
    "\n",
    "batch_size = 128\n",
    "embedding_size = 128  # Dimension of the embedding vector.\n",
    "skip_window = 1       # How many words to consider left and right.\n",
    "num_skips = 2         # How many times to reuse an input to generate a label.\n",
    "\n",
    "# We pick a random validation set to sample nearest neighbors. Here we limit the\n",
    "# validation samples to the words that have a low numeric ID, which by\n",
    "# construction are also the most frequent.\n",
    "valid_size = 16     # Random set of words to evaluate similarity on.\n",
    "valid_window = 100  # Only pick dev samples in the head of the distribution.\n",
    "valid_examples = np.random.choice(valid_window, valid_size, replace=False)\n",
    "num_sampled = 64    # Number of negative examples to sample.\n",
    "\n",
    "graph = tf.Graph()\n",
    "\n",
    "with graph.as_default():\n",
    "\n",
    "  # Input data.\n",
    "  train_inputs = tf.placeholder(tf.int32, shape=[batch_size])\n",
    "  train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])\n",
    "  valid_dataset = tf.constant(valid_examples, dtype=tf.int32)\n",
    "\n",
    "  # Ops and variables pinned to the CPU because of missing GPU implementation\n",
    "  with tf.device('/cpu:0'):\n",
    "    # Look up embeddings for inputs.\n",
    "    embeddings = tf.Variable(\n",
    "        tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))\n",
    "    embed = tf.nn.embedding_lookup(embeddings, train_inputs)\n",
    "    print('embeddings', embeddings);\n",
    "    print('embed', embed);\n",
    "    \n",
    "    # Construct the variables for the NCE loss\n",
    "    nce_weights = tf.Variable(\n",
    "        tf.truncated_normal([vocabulary_size, embedding_size],\n",
    "                            stddev=1.0 / math.sqrt(embedding_size)))\n",
    "    nce_biases = tf.Variable(tf.zeros([vocabulary_size]))\n",
    "    print('nce_weights', nce_weights);\n",
    "    print('nce_biases', nce_biases);\n",
    "\n",
    "  # Compute the average NCE loss for the batch.\n",
    "  # tf.nce_loss automatically draws a new sample of the negative labels each\n",
    "  # time we evaluate the loss.\n",
    "  loss = tf.reduce_mean(\n",
    "      tf.nn.nce_loss(weights=nce_weights,\n",
    "                     biases=nce_biases,\n",
    "                     labels=train_labels,\n",
    "                     inputs=embed,\n",
    "                     num_sampled=num_sampled,\n",
    "                     num_classes=vocabulary_size))\n",
    "\n",
    "  # Construct the SGD optimizer using a learning rate of 1.0.\n",
    "  optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)\n",
    "\n",
    "  # Compute the cosine similarity between minibatch examples and all embeddings.\n",
    "  norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))\n",
    "  normalized_embeddings = embeddings / norm\n",
    "  valid_embeddings = tf.nn.embedding_lookup(\n",
    "      normalized_embeddings, valid_dataset)\n",
    "  similarity = tf.matmul(\n",
    "      valid_embeddings, normalized_embeddings, transpose_b=True)\n",
    "\n",
    "  # Add variable initializer.\n",
    "  init = tf.global_variables_initializer()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initialized\n",
      "Average loss at step  0 :  274.29788208\n",
      "Nearest to more: rhetorician, mich, rates, porpoises, spill, prestige, avoided, holiest,\n",
      "Nearest to into: decompose, eon, winding, disaccharide, ribose, pepin, obsolescence, collectivity,\n",
      "Nearest to no: khoisan, upanishads, snacks, exarch, merit, hidalgo, fruits, gnp,\n",
      "Nearest to was: unn, fayette, alf, dances, paralleled, schiavo, carmelite, envoy,\n",
      "Nearest to had: glaciers, bumpy, eucharistic, livonia, mckinley, listened, glaring, esotericism,\n",
      "Nearest to one: abdur, rochester, azeglio, wavelet, notwithstanding, aggressors, rale, sammon,\n",
      "Nearest to nine: hemispheres, tolls, mongoloids, thermionic, czechs, dona, levelled, complications,\n",
      "Nearest to such: really, entail, gunfight, cathode, freshly, progressive, node, paleolithic,\n",
      "Nearest to be: hypocrisy, combats, morocco, offense, interpretive, senile, johns, taipa,\n",
      "Nearest to other: reproductions, fermium, divisors, oppenheim, gacy, modulation, pretoria, karlsruhe,\n",
      "Nearest to are: reliability, colorless, magician, fullwidth, hire, hamad, sextans, molecules,\n",
      "Nearest to have: downtown, soften, emilio, shih, mixed, permutations, router, cruciform,\n",
      "Nearest to during: pdf, arterial, malm, unalaska, language, guitarist, marshes, uganda,\n",
      "Nearest to in: honduran, wieman, alcindor, subscriptions, lcms, visualized, letsie, vitamin,\n",
      "Nearest to by: classes, strung, pale, entombed, labeled, disregarded, steak, loving,\n",
      "Nearest to up: galeon, mechanised, midtown, condemned, grammer, quarrel, puncture, sampa,\n",
      "Average loss at step  2000 :  113.909365058\n",
      "Average loss at step  4000 :  52.070139909\n",
      "Average loss at step  6000 :  33.6091925418\n",
      "Average loss at step  8000 :  23.5522460344\n",
      "Average loss at step  10000 :  17.8713903297\n",
      "Nearest to more: avoided, aristotle, archie, rates, lafayette, provides, theologians, mich,\n",
      "Nearest to into: archie, cardinality, nonetheless, oxford, lenoir, warned, today, thalamus,\n",
      "Nearest to no: aberdeenshire, a, vocals, khoisan, production, invertebrate, bckgr, deke,\n",
      "Nearest to was: is, lords, had, murders, UNK, five, were, bckgr,\n",
      "Nearest to had: glaciers, release, was, existent, mckinley, carolina, imagine, pads,\n",
      "Nearest to one: archie, two, UNK, bckgr, zero, var, rudolph, coke,\n",
      "Nearest to nine: zero, eight, phi, six, four, agave, altenberg, archie,\n",
      "Nearest to such: cathode, capable, really, soon, entail, mikhail, casey, native,\n",
      "Nearest to be: agave, bodyline, ar, morocco, suez, erebus, pine, hypocrisy,\n",
      "Nearest to other: modulation, hell, primarily, isomers, milk, obtained, citing, convinced,\n",
      "Nearest to are: were, is, r, cia, basins, as, bckgr, molecules,\n",
      "Nearest to have: downtown, mosque, seeks, emilio, bodyline, assumes, survive, rudolph,\n",
      "Nearest to during: in, pdf, language, agave, guitarist, and, affect, persia,\n",
      "Nearest to in: and, of, from, bckgr, phi, on, for, at,\n",
      "Nearest to by: classes, and, in, with, of, mathbf, genera, from,\n",
      "Nearest to up: phi, week, condemned, agave, reformed, above, quarrel, conflict,\n",
      "Average loss at step  12000 :  14.0249966706\n",
      "Average loss at step  14000 :  11.769307356\n",
      "Average loss at step  16000 :  10.0291605637\n",
      "Average loss at step  18000 :  8.4440647254\n",
      "Average loss at step  20000 :  8.03261141276\n",
      "Nearest to more: avoided, theologians, circ, rhetorician, aristotle, operatorname, lafayette, blackmail,\n",
      "Nearest to into: and, on, of, meals, pepin, in, circ, cardinality,\n",
      "Nearest to no: a, UNK, aberdeenshire, vocals, production, subkey, khoisan, volume,\n",
      "Nearest to was: is, were, by, had, are, operatorname, and, been,\n",
      "Nearest to had: was, have, pyruvate, and, as, is, existent, also,\n",
      "Nearest to one: two, three, eight, archie, six, nine, five, seven,\n",
      "Nearest to nine: eight, six, seven, zero, five, four, three, two,\n",
      "Nearest to such: casey, really, soon, mikhail, fricatives, cathode, native, circ,\n",
      "Nearest to be: agave, is, bodyline, have, by, ar, courtney, erebus,\n",
      "Nearest to other: hell, modulation, operatorname, isomers, milk, primarily, scatter, distributed,\n",
      "Nearest to are: were, is, was, in, and, r, basins, as,\n",
      "Nearest to have: had, be, bodyline, downtown, were, has, seeks, assumes,\n",
      "Nearest to during: in, and, pdf, language, guitarist, agave, wwe, marshes,\n",
      "Nearest to in: and, of, on, at, from, operatorname, for, phi,\n",
      "Nearest to by: classes, with, was, in, and, been, is, as,\n",
      "Nearest to up: week, condemned, reformed, phi, depiction, above, quarrel, puncture,\n"
     ]
    }
   ],
   "source": [
    "# Step 5: Begin training.\n",
    "num_steps = 20001\n",
    "\n",
    "with tf.Session(graph=graph) as session:\n",
    "  # We must initialize all variables before we use them.\n",
    "  init.run()\n",
    "  print(\"Initialized\")\n",
    "\n",
    "  average_loss = 0\n",
    "  for step in xrange(num_steps):\n",
    "    batch_inputs, batch_labels = generate_batch(\n",
    "        batch_size, num_skips, skip_window)\n",
    "    \n",
    "#     print('batch_inputs = ', batch_inputs);\n",
    "#     print('batch_labels = ', batch_labels);\n",
    "    \n",
    "    feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}\n",
    "\n",
    "    # We perform one update step by evaluating the optimizer op (including it\n",
    "    # in the list of returned values for session.run()\n",
    "    _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)\n",
    "    average_loss += loss_val\n",
    "\n",
    "    if step % 2000 == 0:\n",
    "      if step > 0:\n",
    "        average_loss /= 2000\n",
    "      # The average loss is an estimate of the loss over the last 2000 batches.\n",
    "      print(\"Average loss at step \", step, \": \", average_loss)\n",
    "      average_loss = 0\n",
    "\n",
    "    # Note that this is expensive (~20% slowdown if computed every 500 steps)\n",
    "    if step % 10000 == 0:\n",
    "      sim = similarity.eval()\n",
    "      for i in xrange(valid_size):\n",
    "        valid_word = reverse_dictionary[valid_examples[i]]\n",
    "        top_k = 8  # number of nearest neighbors\n",
    "        nearest = (-sim[i, :]).argsort()[1:top_k + 1]\n",
    "        log_str = \"Nearest to %s:\" % valid_word\n",
    "        for k in xrange(top_k):\n",
    "          close_word = reverse_dictionary[nearest[k]]\n",
    "          log_str = \"%s %s,\" % (log_str, close_word)\n",
    "        print(log_str)\n",
    "  final_embeddings = normalized_embeddings.eval()\n",
    "#   print('sim = ',sim, tf.shape(sim).eval(), tf.rank(sim).eval());\n",
    "#   print('final_embeddings = ',final_embeddings, tf.shape(final_embeddings).eval(), tf.rank(final_embeddings).eval());"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Step 6: Visualize the embeddings.\n",
    "\n",
    "def plot_with_labels(low_dim_embs, labels, filename='tsne.png'):\n",
    "  assert low_dim_embs.shape[0] >= len(labels), \"More labels than embeddings\"\n",
    "  plt.figure(figsize=(18, 18))  # in inches\n",
    "  for i, label in enumerate(labels):\n",
    "    x, y = low_dim_embs[i, :]\n",
    "    plt.scatter(x, y)\n",
    "    plt.annotate(label,\n",
    "                 xy=(x, y),\n",
    "                 xytext=(5, 2),\n",
    "                 textcoords='offset points',\n",
    "                 ha='right',\n",
    "                 va='bottom')\n",
    "\n",
    "  plt.savefig(filename)\n",
    "\n",
    "try:\n",
    "  from sklearn.manifold import TSNE\n",
    "  import matplotlib.pyplot as plt\n",
    "\n",
    "  tsne = TSNE(perplexity=30, n_components=2, init='pca', n_iter=5000)\n",
    "  plot_only = 500\n",
    "  low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])\n",
    "  labels = [reverse_dictionary[i] for i in xrange(plot_only)]\n",
    "  plot_with_labels(low_dim_embs, labels)\n",
    "\n",
    "except ImportError:\n",
    "  print(\"Please install sklearn, matplotlib, and scipy to visualize embeddings.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
